This report includes the automated processes of data verification, validation, quality assessment, and preliminary analysis for a Water Quality Program’s Black Lake Pollution Identification and Correction Project. QA related to proper field and laboratory processes is conducted independently of this report. Code is included for transparency and reproducibility.

If the decision to accept or reject a value is unclear, the option most protective of public health will be chosen.

library(ggplot2)
library(plotly)
library(plyr)
library(flexdashboard)
library(tidyverse)
library(lubridate)
library(leaflet)
library(htmltools)
library(leaflet.extras)
library(leaflet.providers)
library(datasets)
library(sf)
library(crosstalk)
library(dplyr)
library(reactable)
library(mapview)
library(httr)
library(bslib)
library(xfun)

#Plotly
# NOTE(review): installing from GitHub during every run makes the report
# non-reproducible and requires network access. Install once, manually,
# outside the report instead of at render time.
# devtools::install_github("ropensci/plotly")
library(plotly) # already attached above; re-attaching is harmless
library(RCurl)

# Public copy of the data, kept for reference / external reproducibility:
#df <- read.csv("https://raw.githubusercontent.com/AlyssaPeter/R-Code-Sample/main/EIM_Black_Lake_PIC_Project_Data.csv", stringsAsFactors = TRUE, fileEncoding = "ISO-8859-1")

# Load the project data. fileEncoding = "latin1" matches the EIM export
# (the mojibake in the first column name suggests the encoding still needs
# review -- see the null-value output below).
df <- read.csv(
  "O:/EH_Health/Surface Water/+ PIC/Projects/Black Lake Grant 2022-25/Data/EIM_Black_Lake_PIC_Project_Data.csv",
  stringsAsFactors = FALSE,
  fileEncoding = "latin1"
)
df #load dataframe

Data Verification

Basic Checks: check the dataframe for null values.

# Count NA values per column. vapply() is preferred over sapply() here:
# it guarantees a named integer vector regardless of input shape, whereas
# sapply() can silently change its return type on degenerate input.
null_values <- vapply(df, function(x) sum(is.na(x)), integer(1))
null_values
##          Ã.........Study_ID                 Location_ID 
##                           0                           0 
##  Study_Specific_Location_ID       Field_Collection_Type 
##                           0                           0 
##             Field_Collector Field_Collection_Start_Date 
##                           0                           0 
##    Field_Collection_Comment    Latitude_Decimal_Degrees 
##                           0                           0 
##   Longitude_Decimal_Degrees                   Sample_ID 
##                           0                           0 
##   Sample_Field_Replicate_ID       Sample_Replicate_Flag 
##                           0                           0 
##       Sample_Composite_Flag               Sample_Matrix 
##                           0                           0 
##               Sample_Source    Sample_Collection_Method 
##                           0                           0 
##   Sample_Preparation_Method       Result_Parameter_Name 
##                         532                           0 
## Result_Parameter_CAS_Number           Lab_Analysis_Date 
##                           0                           0 
##                Result_Value          Result_Value_Units 
##                           0                           0 
##      Result_Reporting_Limit Result_Reporting_Limit_Type 
##                           0                         532 
##      Result_Detection_Limit Result_Detection_Limit_Type 
##                           0                           0 
##       Result_Data_Qualifier           Fraction_Analyzed 
##                           0                           0 
##               Result_Method             Result_Lab_Name 
##                           0                           0
#Sometimes values are input to data table with trailing spaces. Ensure that all columns have valid values.
# (A stray space makes "BLA007 " group and count separately from "BLA007".)

#For example: list every distinct site ID so whitespace/typo variants stand out.
unique(df$Study_Specific_Location_ID)
##  [1] "BLA001"   "BLA003"   "BLA004"   "BLA005"   "BLA006"   "BLA007"  
##  [7] "BLA002"   "BLA008"   "BLA009"   "BLA011"   "BLA00201" "BLA00202"
## [13] "BLA00203" "BLA00301" "BLA00204" "BLA00501" "BLA00502" "BLA00701"
## [19] "BLA00702" "BLA00703" "BLA00704" "BLA00705" "BLA00706" "BLA00707"
## [25] "BLA00708" "BLA00709" "BLA00102" "BLA00103" "BLA00104" "BLA00101"
## [31] "BLA00105" "BLA00902" "BLA00903" "BLA00904" "BLA00905" "BLA00601"
## [37] "BLA00602" "BLA00710" "BLA00712"
#  [1] "BLA001"   "BLA003"   "BLA004"   "BLA005"   "BLA006"   "BLA007"   "BLA002"   "BLA008"   "BLA009"   "BLA00201" "BLA00202"
# [12] "BLA00203" "BLA00204" "BLA00301" "BLA00501" "BLA00502" "BLA00701" "BLA00702" "BLA00703" "BLA00704" "BLA00705" "BLA00706"
# [23] "BLA00707" "BLA00708" "BLA00709" "BLA00102" "BLA00103" "BLA00104" "BLA00101" "BLA00105" "BLA007 " 

#I can see that the the column Study Specific Location ID has a repeat site ID of BLA007 with a trailing space. I will then identify the record(s) with the trailing space and fix them.

# Strip stray whitespace from site IDs. Trimming BOTH sides (not only the
# right) also catches leading spaces, which are equally invisible in output;
# this is the more protective choice, consistent with the QA policy stated
# at the top of the report.
df$Study_Specific_Location_ID <- trimws(df$Study_Specific_Location_ID, which = "both")
unique(df$Study_Specific_Location_ID) #check again, BLA007 with trailing space has been corrected
##  [1] "BLA001"   "BLA003"   "BLA004"   "BLA005"   "BLA006"   "BLA007"  
##  [7] "BLA002"   "BLA008"   "BLA009"   "BLA011"   "BLA00201" "BLA00202"
## [13] "BLA00203" "BLA00301" "BLA00204" "BLA00501" "BLA00502" "BLA00701"
## [19] "BLA00702" "BLA00703" "BLA00704" "BLA00705" "BLA00706" "BLA00707"
## [25] "BLA00708" "BLA00709" "BLA00102" "BLA00103" "BLA00104" "BLA00101"
## [31] "BLA00105" "BLA00902" "BLA00903" "BLA00904" "BLA00905" "BLA00601"
## [37] "BLA00602" "BLA00710" "BLA00712"
#  [1] "BLA001"   "BLA003"   "BLA004"   "BLA005"   "BLA006"   "BLA007"   "BLA002"   "BLA008"   "BLA009"   "BLA00201" "BLA00202"
# [12] "BLA00203" "BLA00204" "BLA00301" "BLA00501" "BLA00502" "BLA00701" "BLA00702" "BLA00703" "BLA00704" "BLA00705" "BLA00706"
# [23] "BLA00707" "BLA00708" "BLA00709" "BLA00102" "BLA00103" "BLA00104" "BLA00101" "BLA00105"

#repeat for any other variables of concern
# unique() lists each distinct parameter name, exposing trailing-space
# variants such as "Total Nitrogen " noted in the earlier run below.
unique(df$Result_Parameter_Name)
## [1] "Total Phosphorus" "E. coli"
# [1] "E.coli"           "Total Nitrogen "  "Total Phosphorus"  #TN has a trailing space that needs to be fixed

Spatial Errors: check whether the Latitude and Longitude are identical for every record of each Study_Specific_Location_ID. If the return is “TRUE”, the results are okay.

# For each site, verify that all of its records carry a single, consistent
# coordinate pair (exactly one distinct latitude and one distinct longitude).
location_coords_consistency <- df %>%
  group_by(Study_Specific_Location_ID) %>%
  summarise(
    across(
      c(Latitude_Decimal_Degrees, Longitude_Decimal_Degrees),
      ~ n_distinct(.x) == 1
    )
  ) %>%
  rename(
    consistent_latitude = Latitude_Decimal_Degrees,
    consistent_longitude = Longitude_Decimal_Degrees
  )
location_coords_consistency
## # A tibble: 39 × 3
##    Study_Specific_Location_ID consistent_latitude consistent_longitude
##    <chr>                      <lgl>               <lgl>               
##  1 BLA001                     TRUE                TRUE                
##  2 BLA00101                   TRUE                TRUE                
##  3 BLA00102                   TRUE                TRUE                
##  4 BLA00103                   TRUE                TRUE                
##  5 BLA00104                   TRUE                TRUE                
##  6 BLA00105                   TRUE                TRUE                
##  7 BLA002                     TRUE                TRUE                
##  8 BLA00201                   TRUE                TRUE                
##  9 BLA00202                   TRUE                TRUE                
## 10 BLA00203                   TRUE                TRUE                
## # ℹ 29 more rows

FALSE values are observed for some sites. First, define correct values.

# Define the mappings for Latitude and Longitude by Study_Specific_Location_ID
# These named numeric vectors hold the verified coordinates for each site.
# NOTE(review): BLA011, BLA00602, BLA00710, and BLA00712 appear in the data
# but have no entry here, so the update step that follows leaves their
# coordinates unchanged -- confirm that is intended.
latitude_mapping <- c(
  BLA001 = 46.999244, BLA002 = 46.988902, BLA003 = 46.9891, BLA004 = 46.980352,
  BLA005 = 46.9794, BLA006 = 47.00784, BLA007 = 46.980353, BLA008 = 47.009801,
  BLA009 = 46.996677, BLA00201 = 46.98639, BLA00202 = 46.98759, BLA00203 = 46.99045,
  BLA00204 = 46.993164, BLA00301 = 46.98804, BLA00501 = 46.979515, BLA00502 = 46.977944, BLA00701 = 46.98082,
  BLA00702 = 46.97965, BLA00703 = 46.97973, BLA00704 = 46.97975, BLA00705 = 46.97927, BLA00706 = 46.97915, 
  BLA00707 = 46.97786, BLA00708 = 46.97975, BLA00709 = 46.98072, BLA00101 = 47.00011, BLA00102 = 46.9989607,
  BLA00103 = 47.0011206, BLA00104 = 47.0017544, BLA00105 = 47.0034, BLA00601 = 47.0088, BLA00902 = 46.9945, BLA00903 = 46.99592, BLA00904 = 46.99712, BLA00905 = 46.99808
)

# Longitudes are negative (western hemisphere); entries are keyed by the
# same site IDs as latitude_mapping.
longitude_mapping <- c(
  BLA001 = -122.971982, BLA002 = -122.964812, BLA003 = -122.96155, BLA004 = -122.975222,
  BLA005 = -122.9714, BLA006 = -122.97618, BLA007 = -122.9882787, BLA008 = -122.965046,
  BLA009 = -122.984555, BLA00201 = -122.97224, BLA00202 = -122.96935, BLA00203 = -122.96175,
  BLA00204 = -122.957002, BLA00301 = -122.96062, BLA00501 = -122.971299, BLA00502 = -122.968398, BLA00701 = -122.98925,
  BLA00702 = -122.99265, BLA00703 = -122.99271, BLA00704 = -122.99252, BLA00705 = -122.99153, BLA00706 = -122.99149, 
  BLA00707 = -122.99137, BLA00708 = -122.99429, BLA00709 = -122.98933, BLA00101 = -122.96937, BLA00102 = -122.9687191,
  BLA00103 = -122.965938, BLA00104 = -122.9657841, BLA00105 = -122.96321, BLA00601 = -122.97644, BLA00902 = -122.98056, BLA00903 = -122.9839, BLA00904 = -122.98515, BLA00905 = -122.98692
)

Update the lat/long to the defined values:

# Overwrite coordinates with the verified values for every location that
# has a mapping entry; unmapped locations (e.g. BLA011) keep their original
# coordinates. match() returns NA for unmapped IDs; unname() stops the
# named mapping vector from leaking names into the column. Both branches
# are coerced to numeric so the columns end up numeric regardless of how
# read.csv typed them (the original script only ever converted longitude).
lat_idx <- match(df$Study_Specific_Location_ID, names(latitude_mapping))
df$Latitude_Decimal_Degrees <- ifelse(
  is.na(lat_idx),
  as.numeric(as.character(df$Latitude_Decimal_Degrees)),
  unname(latitude_mapping)[lat_idx]
)

lon_idx <- match(df$Study_Specific_Location_ID, names(longitude_mapping))
df$Longitude_Decimal_Degrees <- ifelse(
  is.na(lon_idx),
  as.numeric(as.character(df$Longitude_Decimal_Degrees)),
  unname(longitude_mapping)[lon_idx]
)

Create a quick map to check visually that coordinates are now correct.

# Coerce BOTH coordinate columns to numeric -- the original converted only
# longitude, but leaflet needs numeric values for latitude as well.
df$Latitude_Decimal_Degrees <- as.numeric(as.character(df$Latitude_Decimal_Degrees))
df$Longitude_Decimal_Degrees <- as.numeric(as.character(df$Longitude_Decimal_Degrees))

# One row per site with its coordinates, for a quick visual check.
map <- df %>%
  select(Study_Specific_Location_ID, Latitude_Decimal_Degrees, Longitude_Decimal_Degrees) %>%
  distinct()
print(map)
##    Study_Specific_Location_ID Latitude_Decimal_Degrees
## 1                      BLA001                 46.99924
## 2                      BLA003                 46.98910
## 3                      BLA004                 46.98035
## 4                      BLA005                 46.97940
## 5                      BLA006                 47.00784
## 6                      BLA007                 46.98035
## 7                      BLA002                 46.98890
## 8                      BLA008                 47.00980
## 9                      BLA009                 46.99668
## 10                     BLA011                 46.92828
## 11                   BLA00201                 46.98639
## 12                   BLA00202                 46.98759
## 13                   BLA00203                 46.99045
## 14                   BLA00301                 46.98804
## 15                   BLA00204                 46.99316
## 16                   BLA00501                 46.97951
## 17                   BLA00502                 46.97794
## 18                   BLA00701                 46.98082
## 19                   BLA00702                 46.97965
## 20                   BLA00703                 46.97973
## 21                   BLA00704                 46.97975
## 22                   BLA00705                 46.97927
## 23                   BLA00706                 46.97915
## 24                   BLA00707                 46.97786
## 25                   BLA00708                 46.97975
## 26                   BLA00709                 46.98072
## 27                   BLA00102                 46.99896
## 28                   BLA00103                 47.00112
## 29                   BLA00104                 47.00175
## 30                   BLA00101                 47.00011
## 31                   BLA00105                 47.00340
## 32                   BLA00902                 46.99450
## 33                   BLA00903                 46.99592
## 34                   BLA00904                 46.99712
## 35                   BLA00905                 46.99808
## 36                   BLA00601                 47.00880
## 37                   BLA00602                 47.00785
## 38                   BLA00710                 46.98029
## 39                   BLA00712                 46.98012
##    Longitude_Decimal_Degrees
## 1                  -122.9720
## 2                  -122.9616
## 3                  -122.9752
## 4                  -122.9714
## 5                  -122.9762
## 6                  -122.9883
## 7                  -122.9648
## 8                  -122.9650
## 9                  -122.9846
## 10                 -123.0081
## 11                 -122.9722
## 12                 -122.9694
## 13                 -122.9617
## 14                 -122.9606
## 15                 -122.9570
## 16                 -122.9713
## 17                 -122.9684
## 18                 -122.9892
## 19                 -122.9926
## 20                 -122.9927
## 21                 -122.9925
## 22                 -122.9915
## 23                 -122.9915
## 24                 -122.9914
## 25                 -122.9943
## 26                 -122.9893
## 27                 -122.9687
## 28                 -122.9659
## 29                 -122.9658
## 30                 -122.9694
## 31                 -122.9632
## 32                 -122.9806
## 33                 -122.9839
## 34                 -122.9852
## 35                 -122.9869
## 36                 -122.9764
## 37                 -122.9762
## 38                 -122.9889
## 39                 -122.9902
# Drop any site rows with missing coordinates before plotting.
map <- na.omit(map)
map
##    Study_Specific_Location_ID Latitude_Decimal_Degrees
## 1                      BLA001                 46.99924
## 2                      BLA003                 46.98910
## 3                      BLA004                 46.98035
## 4                      BLA005                 46.97940
## 5                      BLA006                 47.00784
## 6                      BLA007                 46.98035
## 7                      BLA002                 46.98890
## 8                      BLA008                 47.00980
## 9                      BLA009                 46.99668
## 10                     BLA011                 46.92828
## 11                   BLA00201                 46.98639
## 12                   BLA00202                 46.98759
## 13                   BLA00203                 46.99045
## 14                   BLA00301                 46.98804
## 15                   BLA00204                 46.99316
## 16                   BLA00501                 46.97951
## 17                   BLA00502                 46.97794
## 18                   BLA00701                 46.98082
## 19                   BLA00702                 46.97965
## 20                   BLA00703                 46.97973
## 21                   BLA00704                 46.97975
## 22                   BLA00705                 46.97927
## 23                   BLA00706                 46.97915
## 24                   BLA00707                 46.97786
## 25                   BLA00708                 46.97975
## 26                   BLA00709                 46.98072
## 27                   BLA00102                 46.99896
## 28                   BLA00103                 47.00112
## 29                   BLA00104                 47.00175
## 30                   BLA00101                 47.00011
## 31                   BLA00105                 47.00340
## 32                   BLA00902                 46.99450
## 33                   BLA00903                 46.99592
## 34                   BLA00904                 46.99712
## 35                   BLA00905                 46.99808
## 36                   BLA00601                 47.00880
## 37                   BLA00602                 47.00785
## 38                   BLA00710                 46.98029
## 39                   BLA00712                 46.98012
##    Longitude_Decimal_Degrees
## 1                  -122.9720
## 2                  -122.9616
## 3                  -122.9752
## 4                  -122.9714
## 5                  -122.9762
## 6                  -122.9883
## 7                  -122.9648
## 8                  -122.9650
## 9                  -122.9846
## 10                 -123.0081
## 11                 -122.9722
## 12                 -122.9694
## 13                 -122.9617
## 14                 -122.9606
## 15                 -122.9570
## 16                 -122.9713
## 17                 -122.9684
## 18                 -122.9892
## 19                 -122.9926
## 20                 -122.9927
## 21                 -122.9925
## 22                 -122.9915
## 23                 -122.9915
## 24                 -122.9914
## 25                 -122.9943
## 26                 -122.9893
## 27                 -122.9687
## 28                 -122.9659
## 29                 -122.9658
## 30                 -122.9694
## 31                 -122.9632
## 32                 -122.9806
## 33                 -122.9839
## 34                 -122.9852
## 35                 -122.9869
## 36                 -122.9764
## 37                 -122.9762
## 38                 -122.9889
## 39                 -122.9902
# Create a leaflet map of the distinct, corrected site coordinates.
# The popup uses the formula interface (~column) so leaflet resolves the
# column from the supplied data; referencing map$... inside the formula
# bypasses that mechanism, and paste() on a single vector was a no-op.
# htmlEscape() guards against any markup characters in site IDs.
map1 <- leaflet(map) %>%
  addTiles() %>%  # Add default OpenStreetMap tiles
  addMarkers(
    lng = ~Longitude_Decimal_Degrees,
    lat = ~Latitude_Decimal_Degrees,
    popup = ~htmltools::htmlEscape(Study_Specific_Location_ID)
  )
map1

If coordinates are now correct, overwrite the csv:

# CAUTION: this overwrites the original EIM export in place. Keep a backup
# (or write to a separate "verified" file) so the raw data remains available.
write.csv(df, file="O:/EH_Health/Surface Water/+ PIC/Projects/Black Lake Grant 2022-25/Data/EIM_Black_Lake_PIC_Project_Data.csv", row.names=FALSE)

Result Parameter CAS Number

# find location of missing values
# NOTE(review): CAS numbers are recorded as empty strings (""), not NA,
# which is why is.na() reports none while the "" count below finds 215.
print("Position of missing values ")
## [1] "Position of missing values "
which(is.na(df$Result_Parameter_CAS_Number))
## integer(0)
# count total missing values 
# Counting "" captures the blank-string form of "missing" in this export.
print("Count of total missing values")
## [1] "Count of total missing values"
sum(df$Result_Parameter_CAS_Number== "")
## [1] 215

215 rows are missing a Result Parameter CAS Number. This should be corrected in the source data.

Result Data Qualifier When Result Value = 2420, qualifier should be “G” unless “REJ”

# Values at the E. coli upper quantification limit (2420 MPN/100 mL) are
# underestimates and must carry the "G" qualifier unless rejected ("REJ").
# Use a bare column name inside filter(): df$Result_Value bypasses dplyr's
# data masking and misbehaves on grouped data.
underestimates <- df %>% filter(Result_Value == 2420)
qualifier_table <- underestimates %>%
  select(Study_Specific_Location_ID, Field_Collection_Start_Date,
         Result_Value, Result_Data_Qualifier)
print(qualifier_table)
##   Study_Specific_Location_ID Field_Collection_Start_Date Result_Value
## 1                     BLA006                    7/5/2023         2420
## 2                   BLA00301                   8/31/2023         2420
## 3                   BLA00301                   9/21/2023         2420
## 4                   BLA00705                   2/12/2024         2420
## 5                     BLA007                   2/28/2024         2420
##   Result_Data_Qualifier
## 1                     G
## 2                     G
## 3                     G
## 4                     G
## 5                     G

Add qualifier G to any 2420 records without it.

When Result Value = 1, qualifier should be “E” unless “REJ”

# Values at the lower quantification limit (1) are overestimates and must
# carry the "E" qualifier unless rejected ("REJ"). As above, use a bare
# column name inside filter() rather than df$Result_Value.
overestimates <- df %>% filter(Result_Value == 1)
qualifier_tablea <- overestimates %>%
  select(Study_Specific_Location_ID, Field_Collection_Start_Date,
         Result_Value, Result_Data_Qualifier)
print(qualifier_tablea)
##   Study_Specific_Location_ID Field_Collection_Start_Date Result_Value
## 1                     BLA008                  12/14/2023            1
## 2                   BLA00705                   1/29/2024            1
## 3                   BLA00702                   2/20/2024            1
## 4                   BLA00703                   2/20/2024            1
## 5                   BLA00102                   1/20/2024            1
## 6                     BLA008                   3/27/2024            1
## 7                   BLA00712                    4/1/2024            1
##   Result_Data_Qualifier
## 1                     E
## 2                     E
## 3                     E
## 4                     E
## 5                   REJ
## 6                   REJ
## 7                   REJ

All records need qualifier “E” added

Data Validation

Calculate the number of QA samples taken. The result should be 0.10 or greater (at least 10% of the total).

#After identifying REJ records, run the following code again:
#df_REJ <- df[df$Result_Data_Qualifier != "REJ", ]

# QA replicates must be at least 10% of all samples.
# na.rm = TRUE keeps the count valid even if the flag column contains NAs
# (sum() would otherwise propagate NA into the proportion).
replicate_flag_counts <- df %>%
  summarise(
    total_count = n(),
    replicate_count = sum(Sample_Replicate_Flag == "Y", na.rm = TRUE),
    replicate_proportion = replicate_count / total_count
  )
replicate_flag_counts
##   total_count replicate_count replicate_proportion
## 1         532              60             0.112782

Based on results, provide a recommendation to the team to increase/decrease/maintain QA sample frequency.

Implementation Anomalies: check that sample values are within the expected range. If no records are returned, no values are out of the expected range. Review any returned values and accept/reject them accordingly.

##Check expected values
#E.coli is expected to be between 1 and 2419.6
#TP is expected to be between 0.005 and 1000
# NOTE(review): the code's E. coli upper bound (2420) differs from the
# stated 2419.6; 2420 appears to be the rounded upper quantification limit
# used in the "G" qualifier section -- confirm which bound is intended.
# Filter records for Total Phosphorus and E. coli that don't meet the criteria
invalid_records <- df %>%
  filter(
    (Result_Parameter_Name == "Total Phosphorus" & (Result_Value < 0.005 | Result_Value > 1000)) |
      (Result_Parameter_Name == "E. coli" & (Result_Value < 1 | Result_Value > 2420))
  )

# Display the records that fall out of bounds (zero rows = all in range)
invalid_records
##  [1] Ã.........Study_ID          Location_ID                
##  [3] Study_Specific_Location_ID  Field_Collection_Type      
##  [5] Field_Collector             Field_Collection_Start_Date
##  [7] Field_Collection_Comment    Latitude_Decimal_Degrees   
##  [9] Longitude_Decimal_Degrees   Sample_ID                  
## [11] Sample_Field_Replicate_ID   Sample_Replicate_Flag      
## [13] Sample_Composite_Flag       Sample_Matrix              
## [15] Sample_Source               Sample_Collection_Method   
## [17] Sample_Preparation_Method   Result_Parameter_Name      
## [19] Result_Parameter_CAS_Number Lab_Analysis_Date          
## [21] Result_Value                Result_Value_Units         
## [23] Result_Reporting_Limit      Result_Reporting_Limit_Type
## [25] Result_Detection_Limit      Result_Detection_Limit_Type
## [27] Result_Data_Qualifier       Fraction_Analyzed          
## [29] Result_Method               Result_Lab_Name            
## <0 rows> (or 0-length row.names)

Relative Percent Difference

Check the Relative Percent Difference (RPD) for E. coli and phosphorus samples. Samples outside the defined thresholds may be rejected.

\[ RPD = \frac{\lvert{R1-R2}\rvert}{\frac{R1+R2}{2}}\times100 \] Total Phosphorus: TP QA sample values must be within 20% of the sample. OR if TP<0.025, sample is accepted even if they fall outside the 20% range.

# Pair each Total Phosphorus field replicate ("Y") with its routine sample
# ("N") by site and collection date, then compute the relative percent
# difference (RPD) between the pair.

# Filter dataframe by result_parameter_name = 'Total Phosphorus'
total_phosphorus_data <- df %>%
  filter(Result_Parameter_Name == "Total Phosphorus")

# Filter for Sample_Replicate_Flag == "Y"
replicate_Y_data <- total_phosphorus_data %>%
  filter(Sample_Replicate_Flag == "Y")

# Pair rows for Sample_Replicate_Flag == "Y" with Sample_Replicate_Flag == "N"
# NOTE(review): left_join fans out when several "N" rows share a site/date
# (the output shows two BLA00101 2/12/2024 rows) -- consider
# relationship = "one-to-one" or de-duplicating before joining; verify
# which pairing is intended.
paired_data <- replicate_Y_data %>%
  left_join(total_phosphorus_data %>%
              filter(Sample_Replicate_Flag == "N"),
            by = c("Study_Specific_Location_ID", "Field_Collection_Start_Date"),
            suffix = c("_Y", "_N"))

# Calculate the relative percent difference between the two paired values
# RPD = |R1 - R2| / mean(R1, R2) * 100; NA when no "N" partner was found.
paired_data <- paired_data %>%
  mutate(relative_percent_difference = abs(Result_Value_Y - Result_Value_N) / ((Result_Value_Y + Result_Value_N) / 2) * 100)

# Return a table showing both paired values, Study_Specific_Location_ID, Field_Collection_Start_Date, Result_Value, and calculated relative percent difference values
TP_result_table <- paired_data %>%
  select(Study_Specific_Location_ID, Field_Collection_Start_Date, Result_Value_Y, Result_Value_N, relative_percent_difference)

# Print the table
print(TP_result_table)
##    Study_Specific_Location_ID Field_Collection_Start_Date Result_Value_Y
## 1                      BLA003                   7/12/2023          0.041
## 2                      BLA004                   7/20/2023          0.041
## 3                      BLA005                   7/25/2023          0.039
## 4                      BLA007                   9/14/2023          0.039
## 5                      BLA008                  10/12/2023          0.015
## 6                      BLA009                  10/12/2023          0.024
## 7                      BLA001                  12/14/2023          0.028
## 8                      BLA009                  12/14/2023          0.015
## 9                      BLA002                   1/11/2024          0.028
## 10                     BLA004                   1/11/2024          0.018
## 11                     BLA009                   1/11/2024          0.018
## 12                   BLA00301                   1/16/2024          0.039
## 13                   BLA00702                   1/29/2024          0.016
## 14                     BLA002                   1/29/2024          0.031
## 15                   BLA00204                    1/9/2024          0.043
## 16                   BLA00705                   1/22/2024          0.022
## 17                   BLA00202                   1/22/2024          0.032
## 18                   BLA00201                    2/5/2024          0.028
## 19                     BLA007                    2/5/2024          0.024
## 20                   BLA00706                   2/12/2024          0.013
## 21                   BLA00101                   2/12/2024          0.024
## 22                   BLA00101                   2/12/2024          0.024
## 23                   BLA00102                   2/20/2024          0.006
## 24                     BLA002                   2/28/2024          0.034
## 25                     BLA003                   2/28/2024          0.032
## 26                     BLA004                   2/28/2024          0.023
## 27                     BLA001                    3/4/2024          0.020
##    Result_Value_N relative_percent_difference
## 1           0.041                    0.000000
## 2           0.041                    0.000000
## 3           0.050                   24.719101
## 4           0.041                    5.000000
## 5           0.015                    0.000000
## 6           0.023                    4.255319
## 7           0.030                    6.896552
## 8           0.014                    6.896552
## 9           0.030                    6.896552
## 10          0.018                    0.000000
## 11             NA                          NA
## 12          0.034                   13.698630
## 13          0.015                    6.451613
## 14          0.031                    0.000000
## 15          0.043                    0.000000
## 16          0.022                    0.000000
## 17          0.032                    0.000000
## 18          0.026                    7.407407
## 19          0.023                    4.255319
## 20          0.014                    7.407407
## 21          0.023                    4.255319
## 22          0.015                   46.153846
## 23          0.006                    0.000000
## 24             NA                          NA
## 25          0.033                    3.076923
## 26          0.022                    4.444444
## 27          0.020                    0.000000
# Classify each replicate pair by its mean TP concentration: pairs averaging
# below 0.025 mg/L are accepted outright (near the detection limit); the
# rest are marked for evaluation. Pairs with RPD above 20% get a highlight
# flag for reviewer attention.
TP_result_table <- TP_result_table %>%
  mutate(pair_mean = (Result_Value_Y + Result_Value_N) / 2) %>%
  mutate(
    group = ifelse(pair_mean < 0.025, "accepted", "evaluate"),
    highlight = ifelse(relative_percent_difference > 20, "highlight", "")
  ) %>%
  select(-pair_mean)

# Show the table with the added group and highlight columns
print(TP_result_table)
##    Study_Specific_Location_ID Field_Collection_Start_Date Result_Value_Y
## 1                      BLA003                   7/12/2023          0.041
## 2                      BLA004                   7/20/2023          0.041
## 3                      BLA005                   7/25/2023          0.039
## 4                      BLA007                   9/14/2023          0.039
## 5                      BLA008                  10/12/2023          0.015
## 6                      BLA009                  10/12/2023          0.024
## 7                      BLA001                  12/14/2023          0.028
## 8                      BLA009                  12/14/2023          0.015
## 9                      BLA002                   1/11/2024          0.028
## 10                     BLA004                   1/11/2024          0.018
## 11                     BLA009                   1/11/2024          0.018
## 12                   BLA00301                   1/16/2024          0.039
## 13                   BLA00702                   1/29/2024          0.016
## 14                     BLA002                   1/29/2024          0.031
## 15                   BLA00204                    1/9/2024          0.043
## 16                   BLA00705                   1/22/2024          0.022
## 17                   BLA00202                   1/22/2024          0.032
## 18                   BLA00201                    2/5/2024          0.028
## 19                     BLA007                    2/5/2024          0.024
## 20                   BLA00706                   2/12/2024          0.013
## 21                   BLA00101                   2/12/2024          0.024
## 22                   BLA00101                   2/12/2024          0.024
## 23                   BLA00102                   2/20/2024          0.006
## 24                     BLA002                   2/28/2024          0.034
## 25                     BLA003                   2/28/2024          0.032
## 26                     BLA004                   2/28/2024          0.023
## 27                     BLA001                    3/4/2024          0.020
##    Result_Value_N relative_percent_difference    group highlight
## 1           0.041                    0.000000 evaluate          
## 2           0.041                    0.000000 evaluate          
## 3           0.050                   24.719101 evaluate highlight
## 4           0.041                    5.000000 evaluate          
## 5           0.015                    0.000000 accepted          
## 6           0.023                    4.255319 accepted          
## 7           0.030                    6.896552 evaluate          
## 8           0.014                    6.896552 accepted          
## 9           0.030                    6.896552 evaluate          
## 10          0.018                    0.000000 accepted          
## 11             NA                          NA     <NA>      <NA>
## 12          0.034                   13.698630 evaluate          
## 13          0.015                    6.451613 accepted          
## 14          0.031                    0.000000 evaluate          
## 15          0.043                    0.000000 evaluate          
## 16          0.022                    0.000000 accepted          
## 17          0.032                    0.000000 evaluate          
## 18          0.026                    7.407407 evaluate          
## 19          0.023                    4.255319 accepted          
## 20          0.014                    7.407407 accepted          
## 21          0.023                    4.255319 accepted          
## 22          0.015                   46.153846 accepted highlight
## 23          0.006                    0.000000 accepted          
## 24             NA                          NA     <NA>      <NA>
## 25          0.033                    3.076923 evaluate          
## 26          0.022                    4.444444 accepted          
## 27          0.020                    0.000000 accepted
#Values highlighted in table are REJ

RPD for samples needs to be evaluated in context. Results returned in the table as “accepted” will be accepted regardless of RPD because the sample concentrations are close to the detection limit.

Those labeled “evaluate” should be assessed based on both the RPD value and what is known about the site heterogeneity.

BLA005, collected on 7/25/2023, does not meet quality standards and will be removed from the analysis.

E. coli: Bacteria samples with low counts tend to have higher variability. Therefore, EC sample pairs (sample and field duplicate) will be separated into two groups:

• “low count samples” where the pair mean ≤ 20 MPN/100 mL and • “higher count samples” where the pair mean > 20 MPN/100 mL.

For precision of bacteria field replicates: • 50% of the replicate pairs must be at or below 20% RPD • 90% of the pairs must be at or below 50% RPD

# Pair each E. coli field replicate ("Y") with its routine sample ("N") by
# site and collection date, then compute the pair's relative percent
# difference (RPD), mirroring the Total Phosphorus workflow above.

# Filter for E. coli
ecoli_data <- subset(df, Result_Parameter_Name == 'E. coli')

#Filter for records without REJ qualifier (to be done after running with entire dataset)
#ecoli_data <- ecoli_data[ecoli_data$Result_Data_Qualifier != "REJ", ]

# Filter for Sample_Replicate_Flag == "Y"
replicate_Y_data <- ecoli_data %>%
  filter(Sample_Replicate_Flag == "Y")

# Pair rows for Sample_Replicate_Flag == "Y" with Sample_Replicate_Flag == "N"
# NOTE(review): as with the TP pairing, left_join will fan out if several
# "N" rows share a site/date -- verify one routine sample per replicate.
paired_df <- replicate_Y_data %>%
  left_join(ecoli_data %>%
              filter(Sample_Replicate_Flag == "N"),
            by = c("Study_Specific_Location_ID", "Field_Collection_Start_Date"),
            suffix = c("_Y", "_N"))

# Calculate the relative percent difference between the two paired values
# RPD = |R1 - R2| / mean(R1, R2) * 100; NA when no "N" partner was found.
paired_df <- paired_df %>%
  mutate(relative_percent_difference = abs(Result_Value_Y - Result_Value_N) / ((Result_Value_Y + Result_Value_N) / 2) * 100)

EC_result_table <- paired_df %>%
  select(Study_Specific_Location_ID, Field_Collection_Start_Date, Result_Value_Y, Result_Value_N, relative_percent_difference)

# Return the table
EC_result_table
##    Study_Specific_Location_ID Field_Collection_Start_Date Result_Value_Y
## 1                      BLA003                   7/12/2023            410
## 2                      BLA004                   7/20/2023            365
## 3                      BLA005                   7/25/2023             63
## 4                    BLA00203                   9/14/2023            186
## 5                      BLA007                   9/14/2023            816
## 6                      BLA008                  10/12/2023              4
## 7                      BLA009                  10/12/2023            127
## 8                    BLA00502                  11/27/2023              3
## 9                      BLA001                  12/14/2023             15
## 10                     BLA009                  12/14/2023              9
## 11                   BLA00204                    1/9/2024             29
## 12                     BLA002                   1/11/2024             49
## 13                     BLA004                   1/11/2024              8
## 14                     BLA002                   1/22/2024             51
## 15                   BLA00301                   1/16/2024             93
## 16                     BLA002                   1/29/2024              3
## 17                   BLA00201                    2/5/2024             20
## 18                   BLA00705                   1/22/2024             44
## 19                   BLA00702                   1/29/2024              2
## 20                   BLA00706                   2/12/2024            206
## 21                   BLA00102                   1/20/2024              4
## 22                   BLA00101                   2/12/2024             41
## 23                     BLA001                    3/4/2024             19
## 24                   BLA00103                   3/11/2024              3
## 25                     BLA003                   2/28/2024            260
## 26                     BLA004                   2/28/2024             77
## 27                   BLA00902                   3/18/2024              5
## 28                   BLA00902                   3/25/2024              4
## 29                     BLA005                   3/27/2024             20
## 30                     BLA008                   3/27/2024              4
## 31                   BLA00602                    4/1/2024            126
## 32                   BLA00712                    4/1/2024             31
## 33                     BLA007                    4/8/2024            276
## 34                   BLA00602                   4/15/2024             24
##    Result_Value_N relative_percent_difference
## 1             652                   45.574388
## 2             236                   42.928453
## 3              81                   25.000000
## 4             214                   14.000000
## 5             980                   18.262806
## 6               6                   40.000000
## 7              27                  129.870130
## 8               3                    0.000000
## 9              10                   40.000000
## 10              9                    0.000000
## 11             52                   56.790123
## 12             45                    8.510638
## 13             10                   22.222222
## 14             46                   10.309278
## 15            111                   17.647059
## 16              6                   66.666667
## 17             26                   26.086957
## 18             56                   24.000000
## 19              4                   66.666667
## 20            166                   21.505376
## 21              1                  120.000000
## 22             46                   11.494253
## 23             20                    5.128205
## 24             12                  120.000000
## 25            238                    8.835341
## 26             50                   42.519685
## 27              5                    0.000000
## 28              5                   22.222222
## 29             28                   33.333333
## 30              1                  120.000000
## 31             99                   24.000000
## 32              1                  187.500000
## 33            184                   40.000000
## 34             28                   15.384615
# Classify each pair by the mean of the routine (N) and duplicate (Y)
# results: pairs averaging <= 20 MPN/100 mL are "low count samples"
pair_mean <- (EC_result_table$Result_Value_Y + EC_result_table$Result_Value_N) / 2
EC_result_table$group <- ifelse(pair_mean <= 20,
                             "low count samples", "higher count samples")

# Share of pairs (per group) whose RPD is at or below a threshold
pct_at_or_below <- function(threshold) {
  with(EC_result_table,
       tapply(relative_percent_difference <= threshold, group, mean) * 100)
}

# Thresholds from the QA plan: 20% RPD and 50% RPD
percent_diff_20 <- pct_at_or_below(20)
percent_diff_50 <- pct_at_or_below(50)

# Assemble both thresholds into one summary table
group_names <- c("low count samples", "higher count samples")
percent_diff_results <- data.frame(
  Group = group_names,
  Percent_Diff_LE_20 = percent_diff_20[group_names],
  Percent_Diff_LE_50 = percent_diff_50[group_names]
)

# Return the results
percent_diff_results
##                                     Group Percent_Diff_LE_20 Percent_Diff_LE_50
## low count samples       low count samples           28.57143           57.14286
## higher count samples higher count samples           40.00000           90.00000
# • 50% of the replicate pairs must be at or below 20% RPD 
# • 90% of the pairs must be at or below 50% RPD

Determine which replicates are invalid. Investigate corrective actions.

50% of replicate pairs must be at or below 20% RPD. Currently, 28.6% of low count samples are and 40% of higher count samples are.

90% of replicate pairs must be at or below 50% RPD. Currently, 57.1% of low count samples are and 90% of higher count samples are.

Low count samples with non-anomalous larger RPD values will be included in analyses for the time being because the data set is currently incomplete and when more replicate samples have been collected, the percent of samples meeting the thresholds may meet quality standards. A final determination will be made when all data have been collected and the project concluded in 2025.

Analysis should be re-run with the corrected dataframe to see how quality assessment improves.


Data Preparation

Prep data: Remove all duplicate records and any other rejected records:

# Create the analysis dataframe: drop rejected results (REJ qualifier)
# and field duplicates (replicate flag "Y").
# %in% is used instead of == so that an NA qualifier/flag is treated as
# "keep the row": with ==, an NA produces an NA row index and
# df[NA, ] would inject an all-NA row into new_df.
new_df <- df[!(df$Result_Data_Qualifier %in% "REJ" | df$Sample_Replicate_Flag %in% "Y"), ]

Define the date field so it may be recognized as a linear measure of time:

## Convert the collection date from character ("m/d/Y") to Date class
new_df$Field_Collection_Start_Date  # before conversion: character
new_df$Field_Collection_Start_Date <- mdy(new_df$Field_Collection_Start_Date)
head(new_df)
new_df$Field_Collection_Start_Date  # after conversion: Date

Add a new field, “Season”, defined by the Field_Collection_Start_Date:

# Dry season = May through September; all other months are Wet
collection_month <- format(as.Date(new_df$Field_Collection_Start_Date), "%m")
new_df$Season <- ifelse(collection_month %in% c("05", "06", "07", "08", "09"), "Dry", "Wet")

new_df

Save the prepped dataset as an object

# Persist the prepped dataset (hard-coded network paths)
rda_path <- "J:/Git_WS/R-Code-Sample/Inputs/new_df.rda"
out_dir  <- "O:/EH_Health/Surface Water/+ PIC/Black Lake Grant 2022-25/Data"

save(new_df, file = rda_path)
# Make sure the output directory exists before writing the csv
dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
write.csv(new_df, file = file.path(out_dir, "Black_Lake_Data_Prepped.csv"), row.names = FALSE)

Calculate Statistical Quantities

Statistical quantities include measures of: - central tendency (mean, median, etc) - Relative standing (percentiles) - Dispersion (range, variance) - Association (correlation)

Review these quantities to determine: - Do the data look reasonable - do the values make sense? - Are there any obvious anomalies? - Are there any trends or patterns?

Look at numeric distributions:

# Histograms of Result_Value, one facet per parameter
ggplot(new_df, aes(x = Result_Value)) +
  geom_histogram(bins = 30) +
  facet_wrap(~Result_Parameter_Name, scales = "free_x") +
  theme_minimal() +
  labs(
    x = "Result Value",
    y = "Frequency",
    title = "Distributions of Result Value by Parameter Name"
  )

## Check normality of Result_Value with QQ plots, one facet per parameter
ggplot(new_df, aes(sample = Result_Value)) +
  geom_qq() +
  geom_qq_line() +
  facet_wrap(~Result_Parameter_Name) +
  theme_minimal() +
  labs(
    x = "Theoretical Quantiles",
    y = "Sample Quantiles",
    title = "QQ Plots of Result Value by Parameter Name"
  )

Calculate some preliminary statistics.

Return the number of samples taken by site:

# Total number of samples across all parameters (rows of the prepped data)
nrow(new_df)
## [1] 461
#461

# Summary table: sample counts for each of the two monitored parameters
summary <- new_df %>%
  summarise(
    `Total number E. coli Samples Taken` =
      sum(Result_Parameter_Name == "E. coli"),
    `Total number of Total Phosphorus Samples Taken` =
      sum(Result_Parameter_Name == "Total Phosphorus")
  )

summary
##   Total number E. coli Samples Taken
## 1                                273
##   Total number of Total Phosphorus Samples Taken
## 1                                            188
# Cross-tabulate sample counts: parameter (rows) by monitoring site (columns)
samplexsite <- table(new_df$Result_Parameter_Name, new_df$Study_Specific_Location_ID)
samplexsite
##                   
##                    BLA001 BLA00101 BLA00102 BLA00103 BLA00104 BLA00105 BLA002
##   E. coli              18        4        3        3        4        4     19
##   Total Phosphorus     17        5        4        4        4        3     15
##                   
##                    BLA00201 BLA00202 BLA00203 BLA00204 BLA003 BLA00301 BLA004
##   E. coli                 9        9        9        8     21        9     16
##   Total Phosphorus        5        5        5        5     18        5     13
##                   
##                    BLA005 BLA00501 BLA00502 BLA006 BLA00601 BLA00602 BLA007
##   E. coli              17        3        3     13        5        3     21
##   Total Phosphorus     12        0        0      7        0        0     18
##                   
##                    BLA00701 BLA00702 BLA00703 BLA00704 BLA00705 BLA00706
##   E. coli                 1        3        3        7        4        4
##   Total Phosphorus        1        5        3        5        5        5
##                   
##                    BLA00707 BLA00708 BLA00709 BLA00710 BLA00712 BLA008 BLA009
##   E. coli                 4        1        1        3        2      7     12
##   Total Phosphorus        5        1        0        0        0      7      6
##                   
##                    BLA00902 BLA00903 BLA00904 BLA00905 BLA011
##   E. coli                 5        5        5        3      2
##   Total Phosphorus        0        0        0        0      0

Calculate the geomean by routine site and season for E. coli:

# Routine sites only: drop Segmentation samples, keep E. coli records
df_filtered <- new_df %>%
  filter(Field_Collection_Comment != "Segmentation",
         Result_Parameter_Name == "E. coli")

# Restrict to the two defined seasons
df_season_filtered <- df_filtered %>%
  filter(Season %in% c("Wet", "Dry"))

# Geometric mean; NA results are dropped before averaging
geomean <- function(x) exp(mean(log(x), na.rm = TRUE))

# Per site and season: geomean, sample count, and the share of samples
# at or above the 100 and 320 MPN/100 mL criteria
aggregated_df <- df_season_filtered %>%
  group_by(Study_Specific_Location_ID, Season,
           Latitude_Decimal_Degrees, Longitude_Decimal_Degrees) %>%
  summarise(
    Geomean = geomean(Result_Value),
    `Total Number of Samples` = n(),
    `% of Samples above 100` = mean(Result_Value >= 100) * 100,
    `% of Samples above 320` = mean(Result_Value >= 320) * 100,
    .groups = "drop"
  )

aggregated_df
## # A tibble: 19 × 8
##    Study_Specific_Locatio…¹ Season Latitude_Decimal_Deg…² Longitude_Decimal_De…³
##    <chr>                    <chr>                   <dbl>                  <dbl>
##  1 BLA001                   Dry                      47.0                  -123.
##  2 BLA001                   Wet                      47.0                  -123.
##  3 BLA002                   Dry                      47.0                  -123.
##  4 BLA002                   Wet                      47.0                  -123.
##  5 BLA003                   Dry                      47.0                  -123.
##  6 BLA003                   Wet                      47.0                  -123.
##  7 BLA004                   Dry                      47.0                  -123.
##  8 BLA004                   Wet                      47.0                  -123.
##  9 BLA005                   Dry                      47.0                  -123.
## 10 BLA005                   Wet                      47.0                  -123.
## 11 BLA006                   Dry                      47.0                  -123.
## 12 BLA006                   Wet                      47.0                  -123.
## 13 BLA007                   Dry                      47.0                  -123.
## 14 BLA007                   Wet                      47.0                  -123.
## 15 BLA008                   Dry                      47.0                  -123.
## 16 BLA008                   Wet                      47.0                  -123.
## 17 BLA009                   Dry                      47.0                  -123.
## 18 BLA009                   Wet                      47.0                  -123.
## 19 BLA011                   Dry                      46.9                  -123.
## # ℹ abbreviated names: ¹​Study_Specific_Location_ID, ²​Latitude_Decimal_Degrees,
## #   ³​Longitude_Decimal_Degrees
## # ℹ 4 more variables: Geomean <dbl>, `Total Number of Samples` <int>,
## #   `% of Samples above 100` <dbl>, `% of Samples above 320` <dbl>

Calculate the geomean by routine site and season for Total Phosphorus:

# Routine sites only: drop Segmentation samples, keep Total Phosphorus records
df_tp <- new_df %>%
  filter(Field_Collection_Comment != "Segmentation",
         Result_Parameter_Name == "Total Phosphorus")

# Restrict to the two defined seasons
df_season_tp <- df_tp %>%
  filter(Season %in% c("Wet", "Dry"))

# Geometric mean; NA results are dropped before averaging
geomean <- function(x) exp(mean(log(x), na.rm = TRUE))

# Per site and season: geomean, sample count, and the share of samples
# at or above the 0.01 mg/L criterion
aggregated_df_tp <- df_season_tp %>%
  group_by(Study_Specific_Location_ID, Season,
           Latitude_Decimal_Degrees, Longitude_Decimal_Degrees) %>%
  summarise(
    Geomean = geomean(Result_Value),
    `Total Number of Samples` = n(),
    `% of Samples above 0.01mg/L` = mean(Result_Value >= 0.01) * 100,
    .groups = "drop"
  )

aggregated_df_tp
## # A tibble: 18 × 7
##    Study_Specific_Locatio…¹ Season Latitude_Decimal_Deg…² Longitude_Decimal_De…³
##    <chr>                    <chr>                   <dbl>                  <dbl>
##  1 BLA001                   Dry                      47.0                  -123.
##  2 BLA001                   Wet                      47.0                  -123.
##  3 BLA002                   Dry                      47.0                  -123.
##  4 BLA002                   Wet                      47.0                  -123.
##  5 BLA003                   Dry                      47.0                  -123.
##  6 BLA003                   Wet                      47.0                  -123.
##  7 BLA004                   Dry                      47.0                  -123.
##  8 BLA004                   Wet                      47.0                  -123.
##  9 BLA005                   Dry                      47.0                  -123.
## 10 BLA005                   Wet                      47.0                  -123.
## 11 BLA006                   Dry                      47.0                  -123.
## 12 BLA006                   Wet                      47.0                  -123.
## 13 BLA007                   Dry                      47.0                  -123.
## 14 BLA007                   Wet                      47.0                  -123.
## 15 BLA008                   Dry                      47.0                  -123.
## 16 BLA008                   Wet                      47.0                  -123.
## 17 BLA009                   Dry                      47.0                  -123.
## 18 BLA009                   Wet                      47.0                  -123.
## # ℹ abbreviated names: ¹​Study_Specific_Location_ID, ²​Latitude_Decimal_Degrees,
## #   ³​Longitude_Decimal_Degrees
## # ℹ 3 more variables: Geomean <dbl>, `Total Number of Samples` <int>,
## #   `% of Samples above 0.01mg/L` <dbl>
# Tag these rows as Total Phosphorus and score them against the
# 0.01 mg/L standard (geomean below the standard = "Yes")
aggregated_df_tp <- aggregated_df_tp %>%
  mutate(
    Result_Parameter_Name = "Total Phosphorus",
    Meets_Standards = ifelse(Geomean < 0.01, "Yes", "No")
  )

Calculate E. coli geomean for segmented sites by season:

# Repeat the E. coli geomean summary for segmentation samples
df_seg <- new_df %>%
  filter(Field_Collection_Comment == "Segmentation",
         Result_Parameter_Name == "E. coli")

# Restrict to the two defined seasons
df_seg_filtered <- df_seg %>%
  filter(Season %in% c("Wet", "Dry"))

# Geometric mean; NA results are dropped before averaging
geomean <- function(x) exp(mean(log(x), na.rm = TRUE))

# Per site and season: geomean, sample count, and the share of samples
# at or above the 100 and 320 MPN/100 mL criteria
aggregated_df_seg <- df_seg_filtered %>%
  group_by(Study_Specific_Location_ID, Season,
           Latitude_Decimal_Degrees, Longitude_Decimal_Degrees) %>%
  summarise(
    Geomean = geomean(Result_Value),
    `Total Number of Samples` = n(),
    `% of Samples above 100` = mean(Result_Value >= 100) * 100,
    `% of Samples above 320` = mean(Result_Value >= 320) * 100,
    .groups = "drop"
  )

aggregated_df_seg
## # A tibble: 44 × 8
##    Study_Specific_Locatio…¹ Season Latitude_Decimal_Deg…² Longitude_Decimal_De…³
##    <chr>                    <chr>                   <dbl>                  <dbl>
##  1 BLA001                   Wet                      47.0                  -123.
##  2 BLA00101                 Wet                      47.0                  -123.
##  3 BLA00102                 Wet                      47.0                  -123.
##  4 BLA00103                 Wet                      47.0                  -123.
##  5 BLA00104                 Wet                      47.0                  -123.
##  6 BLA00105                 Wet                      47.0                  -123.
##  7 BLA002                   Dry                      47.0                  -123.
##  8 BLA002                   Wet                      47.0                  -123.
##  9 BLA00201                 Dry                      47.0                  -123.
## 10 BLA00201                 Wet                      47.0                  -123.
## # ℹ 34 more rows
## # ℹ abbreviated names: ¹​Study_Specific_Location_ID, ²​Latitude_Decimal_Degrees,
## #   ³​Longitude_Decimal_Degrees
## # ℹ 4 more variables: Geomean <dbl>, `Total Number of Samples` <int>,
## #   `% of Samples above 100` <dbl>, `% of Samples above 320` <dbl>

Now merge geomean tables into a dataframe to be used in the dashboard and save it as a csv

# Combine routine and segmentation E. coli geomean tables
Black_Lake_Geomean_Table <- rbind(aggregated_df, aggregated_df_seg)

# Tag rows as E. coli and score them against the freshwater standard.
# Both criteria must be met: geomean < 100 MPN/100 mL AND fewer than 10%
# of samples above 320. NOTE(review): the original used `|`, which marked
# a site "Yes" when only one criterion passed; per this report's stated
# policy, the option most protective of public health (`&`) is used.
Black_Lake_Geomean_Table$Result_Parameter_Name <- "E. coli"
Black_Lake_Geomean_Table$Meets_Standards <- ifelse(
  Black_Lake_Geomean_Table$Geomean < 100 &
    Black_Lake_Geomean_Table$`% of Samples above 320` < 10,
  "Yes", "No"
)

# Meets_Standards is already character from ifelse(); keep the explicit
# cast so downstream code can rely on the type
Black_Lake_Geomean_Table$Meets_Standards <- as.character(Black_Lake_Geomean_Table$Meets_Standards)

# Append the Total Phosphorus table. bind_rows() fills columns present in
# only one table with NA, so the previous merge(..., all.x = TRUE) step
# (which matched no rows because Result_Parameter_Name always differs)
# was redundant and has been removed.
Black_Lake_Geomean_Table <- bind_rows(Black_Lake_Geomean_Table, aggregated_df_tp)

# Export for the dashboard
write.csv(
  Black_Lake_Geomean_Table,
  file = "O:/EH_Health/Surface Water/+ PIC/Projects/Black Lake Grant 2022-25/Data/Support Data/Black_Lake_Geomean_Table.csv",
  row.names = FALSE
)

Display the data using graphs

Review graphs to determine: - Do the data look reasonable? - What is the distribution like? Is it symmetric, bimodal, etc? - Are there extremely high or extremely low values? - Are there any obvious trends?

Create a boxplot using Plotly for E. coli. The plot is interactive.

## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning: 'layout' objects don't have these attributes: 'boxmode'
## Valid attributes include:
## '_deprecated', 'activeshape', 'annotations', 'autosize', 'autotypenumbers', 'calendar', 'clickmode', 'coloraxis', 'colorscale', 'colorway', 'computed', 'datarevision', 'dragmode', 'editrevision', 'editType', 'font', 'geo', 'grid', 'height', 'hidesources', 'hoverdistance', 'hoverlabel', 'hovermode', 'images', 'legend', 'mapbox', 'margin', 'meta', 'metasrc', 'modebar', 'newshape', 'paper_bgcolor', 'plot_bgcolor', 'polar', 'scene', 'selectdirection', 'selectionrevision', 'separators', 'shapes', 'showlegend', 'sliders', 'smith', 'spikedistance', 'template', 'ternary', 'title', 'transition', 'uirevision', 'uniformtext', 'updatemenus', 'width', 'xaxis', 'yaxis', 'barmode', 'bargap', 'mapType'

Create a boxplot using Plotly for Total Phosphorus

## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning: 'layout' objects don't have these attributes: 'boxmode'
## Valid attributes include:
## '_deprecated', 'activeshape', 'annotations', 'autosize', 'autotypenumbers', 'calendar', 'clickmode', 'coloraxis', 'colorscale', 'colorway', 'computed', 'datarevision', 'dragmode', 'editrevision', 'editType', 'font', 'geo', 'grid', 'height', 'hidesources', 'hoverdistance', 'hoverlabel', 'hovermode', 'images', 'legend', 'mapbox', 'margin', 'meta', 'metasrc', 'modebar', 'newshape', 'paper_bgcolor', 'plot_bgcolor', 'polar', 'scene', 'selectdirection', 'selectionrevision', 'separators', 'shapes', 'showlegend', 'sliders', 'smith', 'spikedistance', 'template', 'ternary', 'title', 'transition', 'uirevision', 'uniformtext', 'updatemenus', 'width', 'xaxis', 'yaxis', 'barmode', 'bargap', 'mapType'

Create plots of E. coli and Total Phosphorus results over time for each site. The plots are interactive and you can click on the series to the right to hide or show them. Static PNGs are also available for download.

library("viridis")
## Warning: package 'viridis' was built under R version 4.3.3
## Loading required package: viridisLite
# Filter the data for the specific location ID and parameter name
df_BLA001_EC <- new_df[new_df$Result_Parameter_Name == "E. coli" & new_df$Study_Specific_Location_ID == "BLA001", ]

# Create the line graph
ggplot(df_BLA001_EC, aes(x = Field_Collection_Start_Date, y = Result_Value)) +
  geom_line(linewidth = 0.9, color = "darkcyan") +
  labs(x = "Month", y = "E. coli (MPN/100mL)") +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.title = element_text(size = 12, face = "bold")
  ) +
  ggtitle("Concentration of E. coli Bacteria in Streamwater at BLA001") +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
  geom_abline(intercept = 100, slope= 0, color = "darkorange3") +
  annotate("text", x = as.Date("2024-05-01"), y = 120, label = "Water Quality Standard", color = "darkorange3", size = 3.5, hjust = 1)

#
#
#
# Filter the data for the specific location ID and parameter name
df_BLA001_EC <- new_df[new_df$Result_Parameter_Name == "Total Phosphorus" & new_df$Study_Specific_Location_ID == "BLA001", ]

# Create the line graph
ggplot(df_BLA001_EC, aes(x = Field_Collection_Start_Date, y = Result_Value)) +
  geom_line(linewidth = 0.9, color = "darkcyan") +
  labs(x = "Month", y = "Total Phosphorus (mg/L)") +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.title = element_text(size = 12, face = "bold")
  ) +
  ggtitle("Concentration of Total Phosphorus in Streamwater at BLA001") +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
  geom_abline(color="darkorange3", intercept=0.01, slope=0, stat="Water Quality Standard") +
  annotate("text", x = as.Date("2024-05-01"), y = 0.005, label = "Water Quality Standard", color = "darkorange3", size = 3.5, hjust = 1)
## Warning in geom_abline(color = "darkorange3", intercept = 0.01, slope = 0, :
## Ignoring unknown parameters: `stat`

# Filter the data for the specific location ID and parameter name
df_BLA002_EC <- new_df[new_df$Result_Parameter_Name == "E. coli" & new_df$Study_Specific_Location_ID == "BLA002", ]

# Create the line graph
ggplot(df_BLA002_EC, aes(x = Field_Collection_Start_Date, y = Result_Value)) +
  geom_line(linewidth = 0.9, color = "darkcyan") +
  labs(x = "Month", y = "E. coli (MPN/100mL)") +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.title = element_text(size = 12, face = "bold")
  ) +
  ggtitle("Concentration of E. coli Bacteria in Streamwater at BLA002") +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
  geom_abline(intercept = 100, slope= 0, color = "darkorange3") +
  annotate("text", x = as.Date("2024-05-01"), y = 120, label = "Water Quality Standard", color = "darkorange3", size = 3.5, hjust = 1)

#
#
#
# Filter the data for the specific location ID and parameter name
df_BLA002_EC <- new_df[new_df$Result_Parameter_Name == "Total Phosphorus" & new_df$Study_Specific_Location_ID == "BLA002", ]

# Create the line graph
ggplot(df_BLA002_EC, aes(x = Field_Collection_Start_Date, y = Result_Value)) +
  geom_line(linewidth = 0.9, color = "darkcyan") +
  labs(x = "Month", y = "Total Phosphorus (mg/L)") +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.title = element_text(size = 12, face = "bold")
  ) +
  ggtitle("Concentration of Total Phosphorus in Streamwater at BLA002") +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
  geom_abline(color="darkorange3", intercept=0.01, slope=0, stat="Water Quality Standard") +
  annotate("text", x = as.Date("2024-05-01"), y = 0.012, label = "Water Quality Standard", color = "darkorange3", size = 3.5, hjust = 1)
## Warning in geom_abline(color = "darkorange3", intercept = 0.01, slope = 0, :
## Ignoring unknown parameters: `stat`

# Filter the data for the specific location ID and parameter name
df_BLA003_EC <- new_df[new_df$Result_Parameter_Name == "E. coli" & new_df$Study_Specific_Location_ID == "BLA003", ]

# Create the line graph
ggplot(df_BLA003_EC, aes(x = Field_Collection_Start_Date, y = Result_Value)) +
  geom_line(linewidth = 0.9, color = "darkcyan") +
  labs(x = "Month", y = "E. coli (MPN/100mL)") +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.title = element_text(size = 12, face = "bold")
  ) +
  ggtitle("Concentration of E. coli Bacteria in Streamwater at BLA003") +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
  geom_abline(intercept = 100, slope= 0, color = "darkorange3") +
  annotate("text", x = as.Date("2024-05-01"), y = 140, label = "Water Quality Standard", color = "darkorange3", size = 3.5, hjust = 1)

#
#
#
# Filter the data for the specific location ID and parameter name
df_BLA003_EC <- new_df[new_df$Result_Parameter_Name == "Total Phosphorus" & new_df$Study_Specific_Location_ID == "BLA003", ]

# Create the line graph
ggplot(df_BLA003_EC, aes(x = Field_Collection_Start_Date, y = Result_Value)) +
  geom_line(linewidth = 0.9, color = "darkcyan") +
  labs(x = "Month", y = "Total Phosphorus (mg/L)") +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.title = element_text(size = 12, face = "bold")
  ) +
  ggtitle("Concentration of Total Phosphorus in Streamwater at BLA003") +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
  geom_abline(color="darkorange3", intercept=0.01, slope=0, stat="Water Quality Standard") +
  annotate("text", x = as.Date("2024-05-01"), y = 0.012, label = "Water Quality Standard", color = "darkorange3", size = 3.5, hjust = 1)
## Warning in geom_abline(color = "darkorange3", intercept = 0.01, slope = 0, :
## Ignoring unknown parameters: `stat`

# Filter the data for the specific location ID and parameter name
df_BLA004_EC <- new_df[new_df$Result_Parameter_Name == "E. coli" & new_df$Study_Specific_Location_ID == "BLA004", ]

# Create the line graph
ggplot(df_BLA004_EC, aes(x = Field_Collection_Start_Date, y = Result_Value)) +
  geom_line(linewidth = 0.9, color = "darkcyan") +
  labs(x = "Month", y = "E. coli (MPN/100mL)") +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.title = element_text(size = 12, face = "bold")
  ) +
  ggtitle("Concentration of E. coli Bacteria in Streamwater at BLA004") +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
  geom_abline(intercept = 100, slope= 0, color = "darkorange3") +
  annotate("text", x = as.Date("2024-05-01"), y = 120, label = "Water Quality Standard", color = "darkorange3", size = 3.5, hjust = 1)

#
#
#
# Filter the data for Total Phosphorus results at station BLA004.
# Renamed from df_BLA004_EC: the old name wrongly implied E. coli data.
df_BLA004_TP <- new_df[new_df$Result_Parameter_Name == "Total Phosphorus" & new_df$Study_Specific_Location_ID == "BLA004", ]

# Line graph of Total Phosphorus over time.
# Fix: the original called geom_abline(..., stat = "Water Quality Standard"),
# but geom_abline() has no `stat` parameter, which triggered the warning
# "Ignoring unknown parameters: `stat`". The reference line is now drawn
# with geom_hline() and labelled via annotate(), as at the other stations.
ggplot(df_BLA004_TP, aes(x = Field_Collection_Start_Date, y = Result_Value)) +
  geom_line(linewidth = 0.9, color = "darkcyan") +
  labs(x = "Month", y = "Total Phosphorus (mg/L)") +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.title = element_text(size = 12, face = "bold")
  ) +
  ggtitle("Concentration of Total Phosphorus in Streamwater at BLA004") +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
  geom_hline(yintercept = 0.01, color = "darkorange3") +
  annotate("text", x = as.Date("2024-05-01"), y = 0.012, label = "Water Quality Standard", color = "darkorange3", size = 3.5, hjust = 1)

# Filter the data for E. coli results at station BLA005
df_BLA005_EC <- new_df[new_df$Result_Parameter_Name == "E. coli" & new_df$Study_Specific_Location_ID == "BLA005", ]

# Line graph of E. coli over time with the 100 MPN/100mL water quality
# standard marked. Fix: geom_hline(yintercept = 100) replaces
# geom_abline(intercept = 100, slope = 0), the idiomatic form for a
# horizontal reference line (consistent with station BLA008).
ggplot(df_BLA005_EC, aes(x = Field_Collection_Start_Date, y = Result_Value)) +
  geom_line(linewidth = 0.9, color = "darkcyan") +
  labs(x = "Month", y = "E. coli (MPN/100mL)") +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.title = element_text(size = 12, face = "bold")
  ) +
  ggtitle("Concentration of E. coli Bacteria in Streamwater at BLA005") +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
  geom_hline(yintercept = 100, color = "darkorange3") +
  annotate("text", x = as.Date("2024-05-01"), y = 145, label = "Water Quality Standard", color = "darkorange3", size = 3.5, hjust = 1)

#
#
#
# Subset station BLA005 to Total Phosphorus results.
# NOTE(review): the variable name says _EC but it holds phosphorus data —
# name kept for compatibility; consider renaming to df_BLA005_TP.
df_BLA005_EC <- new_df[new_df$Result_Parameter_Name == "Total Phosphorus" & new_df$Study_Specific_Location_ID == "BLA005", ]

# Time series of Total Phosphorus at BLA005 with the 0.01 mg/L water
# quality standard drawn as a horizontal reference line.
ggplot(df_BLA005_EC, aes(x = Field_Collection_Start_Date, y = Result_Value)) +
  geom_line(color = "darkcyan", linewidth = 0.9) +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
  geom_hline(yintercept = 0.01, color = "darkorange3") +
  annotate("text",
    x = as.Date("2024-05-01"), y = 0.011,
    label = "Water Quality Standard",
    color = "darkorange3", size = 3.5, hjust = 1
  ) +
  labs(
    x = "Month",
    y = "Total Phosphorus (mg/L)",
    title = "Concentration of Total Phosphorus in Streamwater at BLA005"
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.title = element_text(size = 12, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

# Filter the data for E. coli results at station BLA006
df_BLA006_EC <- new_df[new_df$Result_Parameter_Name == "E. coli" & new_df$Study_Specific_Location_ID == "BLA006", ]

# Line graph of E. coli over time with the 100 MPN/100mL water quality
# standard marked. Fix: geom_hline(yintercept = 100) replaces
# geom_abline(intercept = 100, slope = 0), the idiomatic form for a
# horizontal reference line (consistent with station BLA008).
ggplot(df_BLA006_EC, aes(x = Field_Collection_Start_Date, y = Result_Value)) +
  geom_line(linewidth = 0.9, color = "darkcyan") +
  labs(x = "Month", y = "E. coli (MPN/100mL)") +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.title = element_text(size = 12, face = "bold")
  ) +
  ggtitle("Concentration of E. coli Bacteria in Streamwater at BLA006") +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
  geom_hline(yintercept = 100, color = "darkorange3") +
  annotate("text", x = as.Date("2024-05-01"), y = 150, label = "Water Quality Standard", color = "darkorange3", size = 3.5, hjust = 1)

#
#
#
# Filter the data for Total Phosphorus results at station BLA006.
# Renamed from df_BLA006_EC: the old name wrongly implied E. coli data.
df_BLA006_TP <- new_df[new_df$Result_Parameter_Name == "Total Phosphorus" & new_df$Study_Specific_Location_ID == "BLA006", ]

# Line graph of Total Phosphorus over time.
# Fix: the original called geom_abline(..., stat = "Water Quality Standard"),
# but geom_abline() has no `stat` parameter, which triggered the warning
# "Ignoring unknown parameters: `stat`". The reference line is now drawn
# with geom_hline() and labelled via annotate(), as at the other stations.
ggplot(df_BLA006_TP, aes(x = Field_Collection_Start_Date, y = Result_Value)) +
  geom_line(linewidth = 0.9, color = "darkcyan") +
  labs(x = "Month", y = "Total Phosphorus (mg/L)") +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.title = element_text(size = 12, face = "bold")
  ) +
  ggtitle("Concentration of Total Phosphorus in Streamwater at BLA006") +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
  geom_hline(yintercept = 0.01, color = "darkorange3") +
  annotate("text", x = as.Date("2024-05-01"), y = 0.016, label = "Water Quality Standard", color = "darkorange3", size = 3.5, hjust = 1)

# Filter the data for E. coli results at station BLA007
df_BLA007_EC <- new_df[new_df$Result_Parameter_Name == "E. coli" & new_df$Study_Specific_Location_ID == "BLA007", ]

# Line graph of E. coli over time with the 100 MPN/100mL water quality
# standard marked. Fix: geom_hline(yintercept = 100) replaces
# geom_abline(intercept = 100, slope = 0), the idiomatic form for a
# horizontal reference line (consistent with station BLA008).
ggplot(df_BLA007_EC, aes(x = Field_Collection_Start_Date, y = Result_Value)) +
  geom_line(linewidth = 0.9, color = "darkcyan") +
  labs(x = "Month", y = "E. coli (MPN/100mL)") +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.title = element_text(size = 12, face = "bold")
  ) +
  ggtitle("Concentration of E. coli Bacteria in Streamwater at BLA007") +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
  geom_hline(yintercept = 100, color = "darkorange3") +
  annotate("text", x = as.Date("2024-05-01"), y = 155, label = "Water Quality Standard", color = "darkorange3", size = 3.5, hjust = 1)

#
#
#
# Filter the data for Total Phosphorus results at station BLA007.
# Renamed from df_BLA007_EC: the old name wrongly implied E. coli data.
df_BLA007_TP <- new_df[new_df$Result_Parameter_Name == "Total Phosphorus" & new_df$Study_Specific_Location_ID == "BLA007", ]

# Line graph of Total Phosphorus over time.
# Fix: the original called geom_abline(..., stat = "Water Quality Standard"),
# but geom_abline() has no `stat` parameter, which triggered the warning
# "Ignoring unknown parameters: `stat`". The reference line is now drawn
# with geom_hline() and labelled via annotate(), as at the other stations.
ggplot(df_BLA007_TP, aes(x = Field_Collection_Start_Date, y = Result_Value)) +
  geom_line(linewidth = 0.9, color = "darkcyan") +
  labs(x = "Month", y = "Total Phosphorus (mg/L)") +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.title = element_text(size = 12, face = "bold")
  ) +
  ggtitle("Concentration of Total Phosphorus in Streamwater at BLA007") +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
  geom_hline(yintercept = 0.01, color = "darkorange3") +
  annotate("text", x = as.Date("2024-05-01"), y = 0.012, label = "Water Quality Standard", color = "darkorange3", size = 3.5, hjust = 1)

# Filter the data for E. coli results at station BLA009
df_BLA009_EC <- new_df[new_df$Result_Parameter_Name == "E. coli" & new_df$Study_Specific_Location_ID == "BLA009", ]

# Line graph of E. coli over time with the 100 MPN/100mL water quality
# standard marked. Fix: geom_hline(yintercept = 100) replaces
# geom_abline(intercept = 100, slope = 0), the idiomatic form for a
# horizontal reference line (consistent with station BLA008).
ggplot(df_BLA009_EC, aes(x = Field_Collection_Start_Date, y = Result_Value)) +
  geom_line(linewidth = 0.9, color = "darkcyan") +
  labs(x = "Month", y = "E. coli (MPN/100mL)") +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.title = element_text(size = 12, face = "bold")
  ) +
  ggtitle("Concentration of E. coli Bacteria in Streamwater at BLA009") +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
  geom_hline(yintercept = 100, color = "darkorange3") +
  annotate("text", x = as.Date("2024-05-01"), y = 120, label = "Water Quality Standard", color = "darkorange3", size = 3.5, hjust = 1)

#
#
#
# Subset station BLA009 to Total Phosphorus results.
# NOTE(review): the variable name says _EC but it holds phosphorus data —
# name kept for compatibility; consider renaming to df_BLA009_TP.
df_BLA009_EC <- new_df[new_df$Result_Parameter_Name == "Total Phosphorus" & new_df$Study_Specific_Location_ID == "BLA009", ]

# Time series of Total Phosphorus at BLA009 with the 0.01 mg/L water
# quality standard drawn as a horizontal reference line.
ggplot(df_BLA009_EC, aes(x = Field_Collection_Start_Date, y = Result_Value)) +
  geom_line(color = "darkcyan", linewidth = 0.9) +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
  geom_hline(yintercept = 0.01, color = "darkorange3") +
  annotate("text",
    x = as.Date("2024-05-01"), y = 0.011,
    label = "Water Quality Standard",
    color = "darkorange3", size = 3.5, hjust = 1
  ) +
  labs(
    x = "Month",
    y = "Total Phosphorus (mg/L)",
    title = "Concentration of Total Phosphorus in Streamwater at BLA009"
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.title = element_text(size = 12, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

# Subset station BLA008 to E. coli results.
df_BLA008_EC <- new_df[new_df$Result_Parameter_Name == "E. coli" & new_df$Study_Specific_Location_ID == "BLA008", ]

# Time series of E. coli at BLA008 with the 100 MPN/100mL water quality
# standard drawn as a horizontal reference line.
ggplot(df_BLA008_EC, aes(x = Field_Collection_Start_Date, y = Result_Value)) +
  geom_line(color = "darkcyan", linewidth = 0.9) +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
  geom_hline(yintercept = 100, color = "darkorange3") +
  annotate("text",
    x = as.Date("2024-05-01"), y = 95,
    label = "Water Quality Standard",
    color = "darkorange3", size = 3.5, hjust = 1
  ) +
  labs(
    x = "Month",
    y = "E. coli (MPN/100mL)",
    title = "Concentration of E. coli Bacteria in Streamwater at BLA008"
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.title = element_text(size = 12, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

#
#
#
# Subset station BLA008 to Total Phosphorus results.
# NOTE(review): the variable name says _EC but it holds phosphorus data —
# name kept for compatibility; consider renaming to df_BLA008_TP.
df_BLA008_EC <- new_df[new_df$Result_Parameter_Name == "Total Phosphorus" & new_df$Study_Specific_Location_ID == "BLA008", ]

# Time series of Total Phosphorus at BLA008, with two reference lines:
# the 0.01 mg/L water quality standard and the 0.0195 mg/L TMDL target.
ggplot(df_BLA008_EC, aes(x = Field_Collection_Start_Date, y = Result_Value)) +
  geom_line(color = "darkcyan", linewidth = 0.9) +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
  geom_hline(yintercept = 0.01, color = "darkorange3") +
  annotate("text",
    x = as.Date("2024-05-01"), y = 0.0105,
    label = "Water Quality Standard",
    color = "darkorange3", size = 3.5, hjust = 1
  ) +
  geom_hline(yintercept = 0.0195, color = "darkorange3") +
  annotate("text",
    x = as.Date("2024-05-01"), y = 0.02,
    label = "TMDL Target",
    color = "darkorange3", size = 3.5, hjust = 1
  ) +
  labs(
    x = "Month",
    y = "Total Phosphorus (mg/L)",
    title = "Concentration of Total Phosphorus in Streamwater at BLA008"
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.title = element_text(size = 12, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

# Load the necessary library
#library(dplyr)

# Filter for the two parameters of interest
filtered_df <- new_df %>%
  filter(Result_Parameter_Name %in% c("E. coli", "Total Phosphorus"))

# Pair E. coli and Total Phosphorus results by station and collection date.
# Fix: the original pivot_wider() call had no id_cols, so every
# sample-specific column (Sample_ID, comments, ...) kept the two parameters
# on separate rows; after the !is.na() filter below, zero rows survived
# (the report printed "A tibble: 0 x 3"). Restricting the id columns to
# station + date puts both parameters on one row per sampling visit.
paired_data <- filtered_df %>%
  group_by(Study_Specific_Location_ID, Field_Collection_Start_Date) %>%
  filter(n() >= 2) %>%
  ungroup() %>%
  pivot_wider(
    id_cols = c(Study_Specific_Location_ID, Field_Collection_Start_Date),
    names_from = Result_Parameter_Name,
    values_from = Result_Value,
    values_fn = mean  # average field replicates collected on the same visit
  )

# Keep only visits where both parameters were actually measured
paired_data <- paired_data %>%
  filter(!is.na(`E. coli`) & !is.na(`Total Phosphorus`))

# Pearson correlation (and p-value) per station. cor.test() requires at
# least 3 complete pairs, so stations with fewer paired visits are dropped
# rather than erroring.
valid_data <- paired_data %>%
  group_by(Study_Specific_Location_ID) %>%
  filter(n() >= 3) %>%
  summarise(
    correlation = cor(`E. coli`, `Total Phosphorus`, use = "complete.obs"),
    p_value = cor.test(`E. coli`, `Total Phosphorus`, method = "pearson")$p.value
  )

# Test for association within each group
#association_tests <- paired_data %>%
 # group_by(Study_Specific_Location_ID) %>%
  #summarize(correlation = cor(`E. coli`, `Total Phosphorus`, use = "complete.obs"),
   #         p_value = cor.test(`E. coli`, `Total Phosphorus`, method = "pearson")$p.value)

# Output the results
#print(association_tests)
print(valid_data)
# Restrict the dataset to station BLA001
BLA001_df <- new_df %>% 
  filter(Study_Specific_Location_ID == "BLA001")

# Pair same-day E. coli and Total Phosphorus results. sum() collapses each
# visit to one value per parameter (0 when the parameter is absent), and
# the final filter keeps only visits where both parameters were measured.
paired_df <- BLA001_df %>%
  filter(Result_Parameter_Name %in% c("E. coli", "Total Phosphorus")) %>%
  group_by(Field_Collection_Start_Date) %>%
  summarise(
    E_coli = sum(Result_Value[Result_Parameter_Name == "E. coli"]),
    Total_Phosphorus = sum(Result_Value[Result_Parameter_Name == "Total Phosphorus"])
  ) %>%
  filter(E_coli > 0, Total_Phosphorus > 0)

# Pearson correlation between the paired measurements
cor_coef <- with(paired_df, cor(E_coli, Total_Phosphorus, use = "complete.obs"))

# Scatterplot with a linear fit and the correlation printed in the corner
ggplot(paired_df, aes(x = E_coli, y = Total_Phosphorus)) +
  geom_point() +
  geom_smooth(color = "blue", method = "lm") +
  annotate("text",
    x = Inf, y = Inf,
    label = sprintf("Correlation: %.2f", cor_coef),
    color = "red", size = 5, hjust = 1.1, vjust = 1.1
  ) +
  labs(
    x = "E. coli (CFU or MPN per 100 mL)",
    y = "Total Phosphorus (mg/L)",
    title = "Scatterplot of E. coli vs Total Phosphorus with Correlation (BLA001)"
  )
## `geom_smooth()` using formula = 'y ~ x'

# Restrict the dataset to station BLA002
BLA002_df <- new_df %>% 
  filter(Study_Specific_Location_ID == "BLA002")

# Pair same-day E. coli and Total Phosphorus results. sum() collapses each
# visit to one value per parameter (0 when the parameter is absent), and
# the final filter keeps only visits where both parameters were measured.
paired_df <- BLA002_df %>%
  filter(Result_Parameter_Name %in% c("E. coli", "Total Phosphorus")) %>%
  group_by(Field_Collection_Start_Date) %>%
  summarise(
    E_coli = sum(Result_Value[Result_Parameter_Name == "E. coli"]),
    Total_Phosphorus = sum(Result_Value[Result_Parameter_Name == "Total Phosphorus"])
  ) %>%
  filter(E_coli > 0, Total_Phosphorus > 0)

# Pearson correlation between the paired measurements
cor_coef <- with(paired_df, cor(E_coli, Total_Phosphorus, use = "complete.obs"))

# Scatterplot with a linear fit and the correlation printed in the corner
ggplot(paired_df, aes(x = E_coli, y = Total_Phosphorus)) +
  geom_point() +
  geom_smooth(color = "blue", method = "lm") +
  annotate("text",
    x = Inf, y = Inf,
    label = sprintf("Correlation: %.2f", cor_coef),
    color = "red", size = 5, hjust = 1.1, vjust = 1.1
  ) +
  labs(
    x = "E. coli (CFU or MPN per 100 mL)",
    y = "Total Phosphorus (mg/L)",
    title = "Scatterplot of E. coli vs Total Phosphorus with Correlation (BLA002)"
  )
## `geom_smooth()` using formula = 'y ~ x'

# Restrict the dataset to station BLA003
BLA003_df <- new_df %>% 
  filter(Study_Specific_Location_ID == "BLA003")

# Pair same-day E. coli and Total Phosphorus results. sum() collapses each
# visit to one value per parameter (0 when the parameter is absent), and
# the final filter keeps only visits where both parameters were measured.
paired_df <- BLA003_df %>%
  filter(Result_Parameter_Name %in% c("E. coli", "Total Phosphorus")) %>%
  group_by(Field_Collection_Start_Date) %>%
  summarise(
    E_coli = sum(Result_Value[Result_Parameter_Name == "E. coli"]),
    Total_Phosphorus = sum(Result_Value[Result_Parameter_Name == "Total Phosphorus"])
  ) %>%
  filter(E_coli > 0, Total_Phosphorus > 0)

# Pearson correlation between the paired measurements
cor_coef <- with(paired_df, cor(E_coli, Total_Phosphorus, use = "complete.obs"))

# Scatterplot with a linear fit and the correlation printed in the corner
ggplot(paired_df, aes(x = E_coli, y = Total_Phosphorus)) +
  geom_point() +
  geom_smooth(color = "blue", method = "lm") +
  annotate("text",
    x = Inf, y = Inf,
    label = sprintf("Correlation: %.2f", cor_coef),
    color = "red", size = 5, hjust = 1.1, vjust = 1.1
  ) +
  labs(
    x = "E. coli (CFU or MPN per 100 mL)",
    y = "Total Phosphorus (mg/L)",
    title = "Scatterplot of E. coli vs Total Phosphorus with Correlation (BLA003)"
  )
## `geom_smooth()` using formula = 'y ~ x'

# Restrict the dataset to station BLA004
BLA004_df <- new_df %>% 
  filter(Study_Specific_Location_ID == "BLA004")

# Pair same-day E. coli and Total Phosphorus results. sum() collapses each
# visit to one value per parameter (0 when the parameter is absent), and
# the final filter keeps only visits where both parameters were measured.
paired_df <- BLA004_df %>%
  filter(Result_Parameter_Name %in% c("E. coli", "Total Phosphorus")) %>%
  group_by(Field_Collection_Start_Date) %>%
  summarise(
    E_coli = sum(Result_Value[Result_Parameter_Name == "E. coli"]),
    Total_Phosphorus = sum(Result_Value[Result_Parameter_Name == "Total Phosphorus"])
  ) %>%
  filter(E_coli > 0, Total_Phosphorus > 0)

# Pearson correlation between the paired measurements
cor_coef <- with(paired_df, cor(E_coli, Total_Phosphorus, use = "complete.obs"))

# Scatterplot with a linear fit and the correlation printed in the corner
ggplot(paired_df, aes(x = E_coli, y = Total_Phosphorus)) +
  geom_point() +
  geom_smooth(color = "blue", method = "lm") +
  annotate("text",
    x = Inf, y = Inf,
    label = sprintf("Correlation: %.2f", cor_coef),
    color = "red", size = 5, hjust = 1.1, vjust = 1.1
  ) +
  labs(
    x = "E. coli (CFU or MPN per 100 mL)",
    y = "Total Phosphorus (mg/L)",
    title = "Scatterplot of E. coli vs Total Phosphorus with Correlation (BLA004)"
  )
## `geom_smooth()` using formula = 'y ~ x'

# Restrict the dataset to station BLA005
BLA005_df <- new_df %>% 
  filter(Study_Specific_Location_ID == "BLA005")

# Pair same-day E. coli and Total Phosphorus results. sum() collapses each
# visit to one value per parameter (0 when the parameter is absent), and
# the final filter keeps only visits where both parameters were measured.
paired_df <- BLA005_df %>%
  filter(Result_Parameter_Name %in% c("E. coli", "Total Phosphorus")) %>%
  group_by(Field_Collection_Start_Date) %>%
  summarise(
    E_coli = sum(Result_Value[Result_Parameter_Name == "E. coli"]),
    Total_Phosphorus = sum(Result_Value[Result_Parameter_Name == "Total Phosphorus"])
  ) %>%
  filter(E_coli > 0, Total_Phosphorus > 0)

# Pearson correlation between the paired measurements
cor_coef <- with(paired_df, cor(E_coli, Total_Phosphorus, use = "complete.obs"))

# Scatterplot with a linear fit and the correlation printed in the corner
ggplot(paired_df, aes(x = E_coli, y = Total_Phosphorus)) +
  geom_point() +
  geom_smooth(color = "blue", method = "lm") +
  annotate("text",
    x = Inf, y = Inf,
    label = sprintf("Correlation: %.2f", cor_coef),
    color = "red", size = 5, hjust = 1.1, vjust = 1.1
  ) +
  labs(
    x = "E. coli (CFU or MPN per 100 mL)",
    y = "Total Phosphorus (mg/L)",
    title = "Scatterplot of E. coli vs Total Phosphorus with Correlation (BLA005)"
  )
## `geom_smooth()` using formula = 'y ~ x'

# Restrict the dataset to station BLA006
BLA006_df <- new_df %>% 
  filter(Study_Specific_Location_ID == "BLA006")

# Pair same-day E. coli and Total Phosphorus results. sum() collapses each
# visit to one value per parameter (0 when the parameter is absent), and
# the final filter keeps only visits where both parameters were measured.
paired_df <- BLA006_df %>%
  filter(Result_Parameter_Name %in% c("E. coli", "Total Phosphorus")) %>%
  group_by(Field_Collection_Start_Date) %>%
  summarise(
    E_coli = sum(Result_Value[Result_Parameter_Name == "E. coli"]),
    Total_Phosphorus = sum(Result_Value[Result_Parameter_Name == "Total Phosphorus"])
  ) %>%
  filter(E_coli > 0, Total_Phosphorus > 0)

# Pearson correlation between the paired measurements
cor_coef <- with(paired_df, cor(E_coli, Total_Phosphorus, use = "complete.obs"))

# Scatterplot with a linear fit and the correlation printed in the corner
ggplot(paired_df, aes(x = E_coli, y = Total_Phosphorus)) +
  geom_point() +
  geom_smooth(color = "blue", method = "lm") +
  annotate("text",
    x = Inf, y = Inf,
    label = sprintf("Correlation: %.2f", cor_coef),
    color = "red", size = 5, hjust = 1.1, vjust = 1.1
  ) +
  labs(
    x = "E. coli (CFU or MPN per 100 mL)",
    y = "Total Phosphorus (mg/L)",
    title = "Scatterplot of E. coli vs Total Phosphorus with Correlation (BLA006)"
  )
## `geom_smooth()` using formula = 'y ~ x'

# Filter dataframe for Study_Specific_Location_ID = BLA007.
# Fix: the original filtered on "BLA002" (copy-paste error), so the plot
# titled BLA007 was actually drawn from BLA002 data.
BLA007_df <- new_df %>% 
  filter(Study_Specific_Location_ID == "BLA007")

# Pair records of E. coli and Total Phosphorus based on Field_Collection_Start_Date.
# sum() collapses each visit to one value per parameter (0 when absent);
# the final filter keeps only visits where both parameters were measured.
paired_df <- BLA007_df %>%
  filter(Result_Parameter_Name %in% c("E. coli", "Total Phosphorus")) %>%
  group_by(Field_Collection_Start_Date) %>%
  summarise(
    E_coli = sum(Result_Value[Result_Parameter_Name == "E. coli"]),
    Total_Phosphorus = sum(Result_Value[Result_Parameter_Name == "Total Phosphorus"])
  ) %>%
  filter(E_coli > 0 & Total_Phosphorus > 0)

# Calculate correlation coefficient
cor_coef <- cor(paired_df$E_coli, paired_df$Total_Phosphorus, use = "complete.obs")

# Scatterplot with linear fit and the correlation annotated in the corner
ggplot(paired_df, aes(x = E_coli, y = Total_Phosphorus)) +
  geom_point() +
  geom_smooth(method = "lm", color = "blue") +
  annotate("text", x = Inf, y = Inf, label = sprintf("Correlation: %.2f", cor_coef), 
           hjust = 1.1, vjust = 1.1, size = 5, color = "red") +
  labs(x = "E. coli (CFU or MPN per 100 mL)", y = "Total Phosphorus (mg/L)", 
       title = "Scatterplot of E. coli vs Total Phosphorus with Correlation (BLA007)")
## `geom_smooth()` using formula = 'y ~ x'

# Restrict the dataset to station BLA009
BLA009_df <- new_df %>% 
  filter(Study_Specific_Location_ID == "BLA009")

# Pair same-day E. coli and Total Phosphorus results. sum() collapses each
# visit to one value per parameter (0 when the parameter is absent), and
# the final filter keeps only visits where both parameters were measured.
paired_df <- BLA009_df %>%
  filter(Result_Parameter_Name %in% c("E. coli", "Total Phosphorus")) %>%
  group_by(Field_Collection_Start_Date) %>%
  summarise(
    E_coli = sum(Result_Value[Result_Parameter_Name == "E. coli"]),
    Total_Phosphorus = sum(Result_Value[Result_Parameter_Name == "Total Phosphorus"])
  ) %>%
  filter(E_coli > 0, Total_Phosphorus > 0)

# Pearson correlation between the paired measurements
cor_coef <- with(paired_df, cor(E_coli, Total_Phosphorus, use = "complete.obs"))

# Scatterplot with a linear fit and the correlation printed in the corner
ggplot(paired_df, aes(x = E_coli, y = Total_Phosphorus)) +
  geom_point() +
  geom_smooth(color = "blue", method = "lm") +
  annotate("text",
    x = Inf, y = Inf,
    label = sprintf("Correlation: %.2f", cor_coef),
    color = "red", size = 5, hjust = 1.1, vjust = 1.1
  ) +
  labs(
    x = "E. coli (CFU or MPN per 100 mL)",
    y = "Total Phosphorus (mg/L)",
    title = "Scatterplot of E. coli vs Total Phosphorus with Correlation (BLA009)"
  )
## `geom_smooth()` using formula = 'y ~ x'


From the review of this report, we should gain a sense of data quality. Our prepped dataset should now meet our acceptance criteria.

Further assessment still needs to be done: the data should be of appropriate quality to meet both our performance and acceptance criteria. Performance criteria need to be established, and appropriate amendments and analyses should follow before any decisions are made.

knitr::opts_chunk$set(echo = TRUE)